Antibiotics gene linkage graph

Load generic libraries

source('configuration.r')

Load plot specific libraries

library(foreach)
library(readr)
suppressMessages(library(igraph))
library(ggraph)
library(ggalluvial)

Create graph edge data

dat <- read.table('../tables/antibiotics_gene_linkage.tsv', head=FALSE, stringsAsFactors = FALSE) %>% 
  mutate(id=paste0(V1,':',V2, ':', V3)) %>% 
  mutate(V6=str_replace(V6, 'PheCmlA5', 'Phe')) %>% 
  mutate(V6=str_replace(V6, 'Far1_Fcd', 'Far1_Bla'))

edges.full <- foreach(id=unique(dat$id), .combine=rbind) %do% {
    tmp <- dat[dat$id == id, ]
    if(nrow(tmp) < 2) {
        NULL
    }else{
        edge <- data.frame(t(combn(sort(tmp$V6), 2)), id=id)
        edge$dist <- foreach(r=1:nrow(edge), .combine = c) %do% {
          g1 <- sort(tmp[tmp$V6 == edge[r, ]$X1, 4:5])
          g2 <- sort(tmp[tmp$V6 == edge[r, ]$X2, 4:5])
          min(abs(g1[2]-g2[1]), abs(g1[1]-g2[2]))
        }
        edge
    }
}
## save for ad hoc analysis
write.table(edges.full, '../output_tables/ar_gene_linkage_edge_list_uncollapsed.tsv', row.names = F, quote=F, sep='\t')

Function for ploting

get_graph_data <- function(pattern='', reverse=FALSE, dist.fil=Inf){
  if(reverse){
    edges.collapsed <- filter(edges.full, !str_detect(id, pattern))
  }else{
    edges.collapsed <- filter(edges.full, str_detect(id, pattern)) 
  }
    
  ## filtering and collapse into edge weights
  edges.collapsed <- edges.collapsed %>% 
    group_by(X1, X2) %>% 
    summarise(n=length(dist), weight=sum(dist<dist.fil)) %>% 
    filter(n>1)   ### keep edges with more than 1 support
  
  ## Run MCL graph clustering
  write.table(edges.collapsed %>% filter(weight>0) %>% select(-n), 
              'tmp_edges.tsv', quote=F, sep='\t', col.names = F, row.names = F)
  system('/mnt/software/bin/mcl tmp_edges.tsv  --abc -o tmp_mcl.tsv -overlap keep -I 3 2>/dev/null')
  mcl <- read_tsv('tmp_mcl.tsv', col_names = F)
  grp <- foreach(c=1:nrow(mcl), .combine = 'rbind') %do%{
    tmp <- as.character(na.exclude(as.character(mcl[c,])))
    data.frame(vertex=tmp, grp=c, stringsAsFactors = F)
  } %>% data.frame(row.names = 1)
  grp$grp[grp$grp >= (which(table(grp) < 3)[1])] <- NA ## no annotation for clusters with 2 genes only
  
  ## generate graph data
  g <- graph_from_data_frame(edges.collapsed[,1:3], directed=FALSE)
  ## vertices
  colors <- pal_simpsons('springfield')(16)
  n.colors <- colors[as.numeric(as.factor(str_replace(V(g)$name, '.*_', '')))]
  V(g)$color <- n.colors
  V(g)$community <- grp[V(g)$name,]##cluster_walktrap(g)$membership
  V(g)$Type <- str_replace(V(g)$name, '.*_', '')
  ## Edges
  E(g)$width <- edges.collapsed$n
  E(g)$community <-
    apply(as.data.frame(get.edgelist(g, names=FALSE)), 1,
                      function(x)
                        ifelse(V(g)$community[x[1]] == V(g)$community[x[2]],
                                         LETTERS[V(g)$community[x[1]]], NA))
  ## format vertex names
  aux <- function(x){
      if(length(x) > 2){
          paste0(x[1],'_',x[2])
      }else{
          x[1]
      }
  }
  V(g)$name <- sapply(str_split(V(g)$name, '_'), aux)
  #### carbapenemase
  args <- read.table('../metadata/Bla_Carb.dat', head=F, stringsAsFactors=F)[,1]
  carb <- setNames(rep('No', length(V(g)$name)),V(g)$name)
  carb[names(carb) %in% args] <- 'Yes'
  V(g)$carb <- carb
  
  g
}

Main figure of all genes

g <- get_graph_data(dist.fil = 5e3)
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   X3 = col_character(),
##   X4 = col_character(),
##   X5 = col_character(),
##   X6 = col_character(),
##   X7 = col_character(),
##   X8 = col_character(),
##   X9 = col_character(),
##   X10 = col_character()
## )
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 15 parsing failures.
## row # A tibble: 5 x 5 col     row col   expected   actual    file          expected   <int> <chr> <chr>      <chr>     <chr>         actual 1     4 <NA>  10 columns 6 columns 'tmp_mcl.tsv' file 2     5 <NA>  10 columns 5 columns 'tmp_mcl.tsv' row 3     6 <NA>  10 columns 5 columns 'tmp_mcl.tsv' col 4     7 <NA>  10 columns 5 columns 'tmp_mcl.tsv' expected 5     8 <NA>  10 columns 5 columns 'tmp_mcl.tsv'
## ... ................. ... ................................................ ........ ................................................ ...... ................................................ .... ................................................ ... ................................................ ... ................................................ ........ ................................................
## See problems(...) for more details.
g_cols <- c((pal_lancet("lanonc")(9))[c(2:8,1)], pal_ucscgb("default")(26))[-c(2,7,8,9,10,12,17,18,19,20,21,23,24,26,27)] 
## plot(1:19, 1:19, type="p", pch=65:(65+19-1), cex=2, col=g_cols) ## get legend alphabets
layout = create_layout(g, layout = 'igraph', algorithm = 'kk', kkconst=0.1)
p <- ggraph(layout)+#, circular=TRUE) +
  geom_edge_arc(aes(width=((!is.na(community))+1), alpha=as.factor(is.na(community)), color=community),
                curvature = 0.0,
                #alpha=0.7,
                end_cap=circle(1.5, 'mm'), start_cap=circle(1.5, 'mm'))   +
  scale_edge_color_manual(values=g_cols, 
                          na.value = rgb(0.8,0.8,0.8,0.5)) +
  scale_edge_alpha_manual(values=c(0.8, 0.3), guide=FALSE) +
  geom_node_point(aes(fill=Type, color=carb),size=5, shape=21) +
  scale_color_manual(values=c('grey','firebrick')) + 
  geom_node_text(aes(label = name), size=5, repel = TRUE, fontface='bold') +
  scale_edge_width(range = c(1,1.5)) +
  scale_fill_manual(values=pal_ucscgb('default')(16)[-c(1,3,4)]) +
  theme_void() 
p

ggsave('../plots/fig3a.ar_gene_linkage_graph.pdf', p, width=15, height=17)

Sankey plot

Format data

edge.dat <- data.frame(v=V(g)$name, g=LETTERS[V(g)$community]) %>% filter(!is.na(g))

genome.info <- read.table('../tables/genome_info.dat', sep = '\t') 
merge(dat, genome.info, by=c(1,2), all.x=TRUE) %>% 
  filter(!is.na(V6.y) | V1=='plasmid') %>% ## only take the high/medium quality ones
  select(Species=V1, gene=V6.x) %>% mutate(gene=str_replace(gene, '_.*','')) -> sp.dat

sankey.dat <- 
  merge(sp.dat, edge.dat, by.x=2, by.y=1, all.y=T)  %>% 
  mutate(Species=str_replace(Species,'_',' ')) %>% 
  group_by(Species, g) %>% 
  count %>% 
  mutate(Antibiotics_cluster=as.character(g)) %>% 
  filter(n>3) %>% 
  ungroup() 

sankey.dat$Species <- factor(sankey.dat$Species, ordered = TRUE, levels = c("Acinetobacter baumannii",
"Acinetobacter sp.",
"Burkholderia lata",
"Enterobacter cloacae",
"Klebsiella pneumoniae",
"Pseudomonas aeruginosa",
"Corynebacterium striatum",
"Enterococcus faecalis",
"Enterococcus faecium",
"Enterococcus sp.",
"Staphylococcus aureus",
"Staphylococcus capitis",
"Staphylococcus cohnii",
"Staphylococcus epidermidis",
"Staphylococcus haemolyticus",
"Staphylococcus hominis",
"Staphylococcus lugdunensis",
"Staphylococcus warneri",
"plasmid"))

sankey.dat$Antibiotics_cluster <- factor(sankey.dat$Antibiotics_cluster, ordered=TRUE, levels=c(
  "A", "G",  "J", "K", "L", "M", "N", "B", "C", "D", "E", "F", "H", "I"
))

##  mutate(Species=sapply(Species, function(x) ifelse(x=='plasmid', NA, x)))

Plot

p <- ggplot(subset(sankey.dat, Species!='plasmid'),aes(y=log10(n), axis1=Species, axis2=Antibiotics_cluster)) +
  geom_alluvium(aes(fill=Antibiotics_cluster), width=1/5)  +
  geom_stratum(width=1/5, fill=NA, lwd=1) +
  geom_text(stat = "stratum", label.strata = TRUE, size=10, fontface = "bold", nudge_x=-0.12, hjust=1) +
  scale_fill_manual(values=g_cols[match(levels(sankey.dat$Antibiotics_cluster) , LETTERS[1:14])]) +
  scale_x_discrete(limits = c("Species", "Antibiotics Cluster"), expand = c(.5, .05)) +
  theme_void() +
  theme()
  
p

ggsave('../plots/fig3b.ar_gene_linkage_sankey.pdf', p, width=25, height=25)

Supplementary figures

Staphylococcus aureus network

g <- get_graph_data('Staphylococcus_aureus')
## Parsed with column specification:
## cols(
##   X1 = col_character(),
##   X2 = col_character(),
##   X3 = col_character(),
##   X4 = col_character(),
##   X5 = col_character(),
##   X6 = col_character(),
##   X7 = col_character(),
##   X8 = col_character(),
##   X9 = col_character(),
##   X10 = col_character(),
##   X11 = col_character(),
##   X12 = col_character(),
##   X13 = col_character(),
##   X14 = col_character(),
##   X15 = col_character(),
##   X16 = col_character(),
##   X17 = col_character(),
##   X18 = col_character(),
##   X19 = col_character()
## )
p <- ggraph(g, layout = 'fr')+#, circular=TRUE) +
  geom_edge_arc(aes(width=width, color=community),
                curvature = 0.0,
                alpha=0.7,
                end_cap=circle(1.5, 'mm'), start_cap=circle(1.5, 'mm'))   +
  scale_edge_color_manual(values=g_cols, 
                          na.value = rgb(0.8,0.8,0.8,0.5)) +
  geom_node_point(aes(fill=Type, color=carb),size=5, shape=21) +
  scale_color_manual(values=c('grey','firebrick')) + 
  geom_node_text(aes(label = name), size=5, repel = TRUE, fontface='bold') +
  scale_edge_width(range = c(0.5,3)) +
  scale_fill_manual(values=pal_ucscgb('default')(16)[-c(1,3,4)]) +
  theme_void()
p

ggsave('../plots/sup11.staph_aureus_ar_gene_linkage_graph.pdf', p, width=5, height=5)

Genome network

g <- get_graph_data('plasmid', reverse = TRUE)
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 9 parsing failures.
## row # A tibble: 5 x 5 col     row col   expected   actual     file          expected   <int> <chr> <chr>      <chr>      <chr>         actual 1     2 <NA>  43 columns 39 columns 'tmp_mcl.tsv' file 2     3 <NA>  43 columns 5 columns  'tmp_mcl.tsv' row 3     4 <NA>  43 columns 4 columns  'tmp_mcl.tsv' col 4     5 <NA>  43 columns 3 columns  'tmp_mcl.tsv' expected 5     6 <NA>  43 columns 3 columns  'tmp_mcl.tsv'
## ... ................. ... ................................................. ........ ................................................. ...... ................................................. .... ................................................. ... ................................................. ... ................................................. ........ .................................................
## See problems(...) for more details.
p <- ggraph(g, layout = 'fr')+#, circular=TRUE) +
  geom_edge_arc(aes(width=width, color=community),
                curvature = 0.0,
                alpha=0.7,
                end_cap=circle(1.5, 'mm'), start_cap=circle(1.5, 'mm'))   +
  scale_edge_color_manual(values=g_cols, 
                          na.value = rgb(0.8,0.8,0.8,0.5)) +
  geom_node_point(aes(fill=Type, color=carb),size= 5, shape=21) +
  scale_color_manual(values=c('grey','firebrick')) + 
  geom_node_text(aes(label = name), size=5, repel = TRUE, fontface='bold') +
  scale_edge_width(range = c(0.5,3)) +
  scale_fill_manual(values=pal_ucscgb('default')(16)[-c(1,3,4)]) +
  theme_void()
p

ggsave('../plots/sup12.genome_ar_gene_linkage_graph.pdf', p, width=16, height=16)

Plasmid network

g <- get_graph_data('plasmid')
## Parsed with column specification:
## cols(
##   .default = col_character()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 7 parsing failures.
## row # A tibble: 5 x 5 col     row col   expected   actual     file          expected   <int> <chr> <chr>      <chr>      <chr>         actual 1     2 <NA>  22 columns 13 columns 'tmp_mcl.tsv' file 2     3 <NA>  22 columns 9 columns  'tmp_mcl.tsv' row 3     4 <NA>  22 columns 8 columns  'tmp_mcl.tsv' col 4     5 <NA>  22 columns 6 columns  'tmp_mcl.tsv' expected 5     6 <NA>  22 columns 5 columns  'tmp_mcl.tsv'
## ... ................. ... ................................................. ........ ................................................. ...... ................................................. .... ................................................. ... ................................................. ... ................................................. ........ .................................................
## See problems(...) for more details.
p <- ggraph(g, layout = 'fr')+#, circular=TRUE) +
  geom_edge_arc(aes(width=width, color=community),
                curvature = 0.0,
                alpha=0.7,
                end_cap=circle(1.5, 'mm'), start_cap=circle(1.5, 'mm'))   +
  scale_edge_color_manual(values=g_cols, 
                          na.value = rgb(0.8,0.8,0.8,0.5)) +
  geom_node_point(aes(fill=Type, color=carb),size= as.numeric(as.factor(V(g)$carb) )*5, shape=21) +
  scale_color_manual(values=c('grey','firebrick')) + 
  geom_node_text(aes(label = name), size=5, repel = TRUE, fontface='bold') +
  scale_edge_width(range = c(0.5,3)) +
  scale_fill_manual(values=pal_ucscgb('default')(16)[-c(1,3,4)]) +
  theme_void()
p

ggsave('../plots/sup12.plasmid_ar_gene_linkage_graph.pdf', p, width=15, height=10)

Session informaton

sessionInfo()
## R version 3.4.4 (2018-03-15)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: CentOS release 6.9 (Final)
## 
## Matrix products: default
## BLAS: /usr/lib64/R/lib/libRblas.so
## LAPACK: /usr/lib64/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] ggalluvial_0.9.1 ggraph_1.0.2     igraph_1.2.2     readr_1.1.1     
##  [5] foreach_1.4.4    ggsci_2.9        reshape2_1.4.3   stringr_1.3.0   
##  [9] tibble_2.0.1     tidyr_0.8.2      dplyr_0.8.0.1    gridExtra_2.3   
## [13] ggplot2_3.1.0   
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.0        pillar_1.3.1      compiler_3.4.4   
##  [4] plyr_1.8.4        viridis_0.5.1     iterators_1.0.9  
##  [7] tools_3.4.4       digest_0.6.18     viridisLite_0.3.0
## [10] evaluate_0.10.1   gtable_0.2.0      pkgconfig_2.0.2  
## [13] rlang_0.3.1       cli_1.0.1         ggrepel_0.8.0    
## [16] yaml_2.1.18       withr_2.1.2       knitr_1.20       
## [19] hms_0.4.2         rprojroot_1.3-2   tidyselect_0.2.5 
## [22] glue_1.3.0        R6_2.4.0          fansi_0.4.0      
## [25] rmarkdown_1.9     farver_1.0        tweenr_1.0.0     
## [28] purrr_0.3.0       magrittr_1.5      units_0.6-1      
## [31] MASS_7.3-49       backports_1.1.2   scales_1.0.0     
## [34] codetools_0.2-15  htmltools_0.3.6   assertthat_0.2.0 
## [37] ggforce_0.1.3     colorspace_1.3-2  labeling_0.3     
## [40] utf8_1.1.4        stringi_1.3.1     lazyeval_0.2.1   
## [43] munsell_0.5.0     crayon_1.3.4